import cufflinks as cf
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go
from IPython.display import display, Markdown, Latex
py.init_notebook_mode(connected=True)
cf.set_config_file(offline=True)
pd.options.display.float_format = '{:20,.2f}'.format
# main data-frame init
dtype = {'County': np.str,
'Employee Name': np.str,
'Job Title': np.str,
'Year': np.int64,
'Notes': np.str,
'Agency': np.str,
'Status': np.str}
def to_float(x):
try:
return float(x)
except Exception as e:
return None
converters = {'Base Pay': to_float,
'Overtime Pay': to_float,
'Other Pay': to_float,
'Benefits': to_float,
'Total Pay': to_float,
'Total Pay & Benefits': to_float}
df = pd.read_csv("data.csv.gz", compression='gzip', dtype=dtype, converters=converters)
# definitions
ENTITIES_MAPPING = {
'Teachers': '(?i)teacher',
'Managers': '(?i)principal|director|chief|intendent',
'Superintendents': '(?i)superintendent'
}
ENTITIES = ['Teachers', 'Managers', 'Superintendents']
YEARS = [2016, 2015, 2014, 2013, 2012]
COUNTIES = ['San Mateo County']
HIGHLIGHT_AGECNY = 'San Mateo-Foster City'
def get_df(county, entity, year):
return df[(df['County'] == county) &
(df['Job Title'].str.contains("{entity}".format(entity=ENTITIES_MAPPING[entity]))) &
(df['Year'] == year)]
def display_mean_salary_by_year(county, entity, year):
ga = get_df(county, entity, year).groupby(['Agency'])['Total Pay & Benefits']
ga_mean = ga.mean()
ga_mean = ga_mean.to_frame().reset_index().sort_values(by='Total Pay & Benefits', ascending=False)
bar = go.Bar(x=ga_mean['Agency'],
y=ga_mean['Total Pay & Benefits'],
marker=dict(color=np.where(ga_mean['Agency'] == HIGHLIGHT_AGECNY, 'rgba(222,45,38,0.8)', 'rgba(204,204,204,1)').tolist()))
layout = go.Layout(
margin=go.Margin(
l=60,
r=50,
b=150,
t=100,
pad=2
),
title='{county}, {entity}, Mean Salary - {year}'.format(county=county,
entity=entity,
year=year),
xaxis=dict(
title='Agency',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Mean Salary (USD)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=[bar], layout=layout)
py.offline.iplot(fig, filename='basic-bar')
"""
ga_mean.iplot(title='{county}, {entity}, Mean Salary - {year}'.format(county=county,
entity=entity,
year=year),
x='Agency', kind='bar', filename='cufflinks/grouped-bar-chart',
xTitle='Agencies', yTitle='Mean Salary', colors={'Agency': 'red', 'Total Pay & Benefits': 'black'})
"""
# print table under graph
ga_descr = ga.describe()
#ga_descr[ga_descr.columns] = ga_descr[ga_descr.columns].fillna(0).astype(np.int64)
display(ga_descr.sort_values(by='mean', ascending=False))
def display_cumul_salary_change(county, entity, year_from, year_to):
dt = df[(df['County'] == county) &
(df['Job Title'].str.contains("{entity}".format(entity=ENTITIES_MAPPING[entity]))) &
((df['Year'] == year_from) | (df['Year'] == year_to))]
dt = dt.groupby(['Agency', 'Year'])['Total Pay & Benefits'].mean().to_frame().reset_index()
dt = dt.pivot_table(columns=['Year'], values='Total Pay & Benefits', index=['Agency']).reset_index()
dt.columns = ['Agency', 'year_from', 'year_to'] # because numbers in column names doesn't work
dt['diff'] = dt.apply(lambda row: (row['year_to'] - row['year_from']) * 100 / row['year_from'], axis=1)
dt = dt[(dt['diff'].notnull())]
dt = dt.sort_values(by='diff', ascending=False)
bar = go.Bar(x=dt['Agency'],
y=dt['diff'],
marker=dict(color=np.where(dt['Agency'] == HIGHLIGHT_AGECNY, 'rgba(222,45,38,0.8)', 'rgba(204,204,204,1)').tolist()))
layout = go.Layout(
margin=go.Margin(
l=60,
r=50,
b=150,
t=100,
pad=2
),
title='{county}, {entity}, Cumulative Mean Salary Change in {year_from} - {year_to}'.format(county=county,
entity=entity,
year_from=year_from,
year_to=year_to),
xaxis=dict(
title='Agency',
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
),
yaxis=dict(
title='Salary Change (%)',
titlefont=dict(
size=16,
color='rgb(107, 107, 107)'
),
tickfont=dict(
size=14,
color='rgb(107, 107, 107)'
)
)
)
fig = go.Figure(data=[bar], layout=layout)
py.offline.iplot(fig, filename='basic-bar')
# Salary change
year_from = min(YEARS)
year_to = max(YEARS)
for county in COUNTIES:
display(Markdown('# {county}, Cumulative Mean Salary Change in {year_from} - {year_to}'.format(county=county,
year_from=year_from,
year_to=year_to)))
for entity in ENTITIES:
display(Markdown('## {entity}'.format(entity=entity)))
display_cumul_salary_change('San Mateo County', entity, year_from, year_to)
# Mean salary
year_from = min(YEARS)
year_to = max(YEARS)
for county in COUNTIES:
display(Markdown('# {county}, Mean Salary in {year_from} - {year_to}'.format(county=county,
year_from=year_from,
year_to=year_to)))
for year in YEARS:
display(Markdown('## {year}'.format(year=year)))
for entity in ENTITIES:
display(Markdown('### {entity}'.format(entity=entity)))
display_mean_salary_by_year(county, entity, year)
All stats are based on: https://transparentcalifornia.com/agencies/salaries/school-districts/